Since we know what the data is, let’s try a 3d plot… colored by gender
p <- plotly::plot_ly(data = measure, x = ~chest, y = ~waist, z = ~hips , type = 'scatter3d',mode = 'markers',color = ~gender)
# Sys.setenv("plotly_username"="srivathsesh")
# Sys.setenv("plotly_api_key"="0Sr49roqMwTtuQl3JZ9G")
# api_create(p, filename = "r-clusterAnalysis")
# knitr::include_graphics('https://plot.ly/~srivathsesh/5.embed')
p
Let’s see if we can create the visual clustering mathematically…
Lets start with the eucledian distance. Lets manually create the euclidean distance between first row and the second row \[\sqrt{(measure[1,1] - measure[2,1])^2 + ( measure[1,2] - measure[2,2])^2 + (measure[1,3] - measure[2,3])^2}\] 6.164414
# create a matrix of the euclidean distances
dm <- dist(measure[1:3], method = "euclidean" )
single <- hclust(dm,method = "single")
plot(single,main = "Single")
Complete <- hclust(dm,method = "complete")
plot(Complete,main = "Complete")
Average <- hclust(dm,method = "average")
#plot(as.dendrogram(Average),main = "Average",horiz = T)
plot(Average,main = "Average")
pc <- princomp(measure[,1:3],cor = T)
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3
## Standard deviation 1.4391057 0.7952666 0.54454193
## Proportion of Variance 0.6903417 0.2108163 0.09884197
## Cumulative Proportion 0.6903417 0.9011580 1.00000000
pc2 <- princomp(dm, cor = T)
summary(pc2)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 3.4600702 2.1000928 1.4984859 0.80761233
## Proportion of Variance 0.5986043 0.2205195 0.1122730 0.03261188
## Cumulative Proportion 0.5986043 0.8191238 0.9313968 0.96400865
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.440832659 0.403036145 0.302905493 0.269054424
## Proportion of Variance 0.009716672 0.008121907 0.004587587 0.003619514
## Cumulative Proportion 0.973725322 0.981847229 0.986434816 0.990054330
## Comp.9 Comp.10 Comp.11 Comp.12
## Standard deviation 0.213776213 0.210015148 0.163842236 0.146733170
## Proportion of Variance 0.002285013 0.002205318 0.001342214 0.001076531
## Cumulative Proportion 0.992339344 0.994544662 0.995886876 0.996963407
## Comp.13 Comp.14 Comp.15 Comp.16
## Standard deviation 0.1332257792 0.1138641186 0.0982936332 0.0831398915
## Proportion of Variance 0.0008874554 0.0006482519 0.0004830819 0.0003456121
## Cumulative Proportion 0.9978508622 0.9984991140 0.9989821959 0.9993278080
## Comp.17 Comp.18 Comp.19 Comp.20
## Standard deviation 0.0755832855 0.0629578455 0.0613784668 0
## Proportion of Variance 0.0002856417 0.0001981845 0.0001883658 0
## Cumulative Proportion 0.9996134497 0.9998116342 1.0000000000 1
measure$pc1 <- pc$scores[,1]
measure$pc2 <- pc$scores[,2]
measure$dmpc1 <- pc2$scores[,1]
measure$dmpc2 <- pc2$scores[,2]
measure$single <- cutree(single,h = 3.6)
measure$complete <- cutree(Complete, h = 10)
measure$avg <- cutree(Average, h = 7.9)
plot_ly(data = measure,x = ~pc1, y = ~pc2, color = ~gender, type = 'scatter', mode = 'markers',colors = "Set1") %>%
add_text(text = measure$avg, textposition = "top right",showlegend = F) %>%
layout(title = 'PCA using raw data and class labels based on Average')
plot_ly(data = measure,x = ~dmpc1, y = ~dmpc2, color = ~gender, type = 'scatter', mode = 'markers',colors = "Set1") %>%
add_text(text = measure$avg, textposition = "top right",showlegend = F) %>%
layout(title = 'PCA using Eucleadian distances and class labels based on Average')
## K-Means Clustering - Textbook example
crime <- readRDS(file = 'crime.rds')
symbol <- rep(20,51)
symbol[which(crime$Murder > 15)] <- 3
pairs(crime,pch = symbol)
The “+” symbol indicates the record for MD… did you see that its an outlier. Lets leave that out data point
crime_subset <- dplyr::filter(crime, rownames(crime) != 'DC')
rownames(crime_subset) <- rownames(crime[which(rownames(crime) != 'DC'),])
sapply(crime_subset,'var')
## Murder Rape Robbery Assault Burglary
## 11.93492 209.76335 11889.56122 19373.53510 175895.00449
## Theft Vehicle
## 565276.55878 43997.35878
See the variance is different for the different variables? It necessary standardize the variables… Not necessarily scale it. The scaling doesn’t matter as we are interested in the distances within each axis (variable… see the euclidean formula above.)
# Get the ranges for each column
rge <- sapply(crime_subset, function(x) diff(range(x)))
crime_s <- sweep(crime_subset,2,rge,FUN = "/")
rownames(crime_s) <- rownames(crime_subset)
n <- nrow(crime_s)
OverallSS <- (n-1) * sum(sapply(crime_s,var))
kmeanscree <- function(data,center) {
withinss <- purrr::map(.x = center, .f = function(x) kmeans(data,center =x)$tot.withinss)
withinss <- c((nrow(data) - 1) * sum(sapply(data,var)),withinss)
plot(x = c(1,center), y = unlist(withinss),xlab = 'Number of clusters',ylab = 'Within cluster Sum of Squares',type = 'b' )
}
kmeanscree(data = crime_s,center = 2:6)
2 clusters form the elbow of the scree plot. We’ll use the 2 clusters for the kmean clustering.
kclust <- kmeans(x = crime_s, centers = 2)
# converting back to non standardized units
kclust$center*rge
## Murder Rape Robbery Assault Burglary Theft Vehicle
## 1 9.368182 376.4971 829.2781 555.98881 51.36291 640.0962 2070.7959
## 2 23.165629 232.6096 438.9973 3.97189 255.21604 1561.7962 247.0357
# PCA
pca.crime <- princomp(x = crime_s,cor = F)
summary(pca.crime)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 0.5162052 0.2384135 0.20256101 0.13721052
## Proportion of Variance 0.6477705 0.1381778 0.09974432 0.04576688
## Cumulative Proportion 0.6477705 0.7859483 0.88569258 0.93145946
## Comp.5 Comp.6 Comp.7
## Standard deviation 0.10914469 0.10332169 0.07487990
## Proportion of Variance 0.02895887 0.02595132 0.01363035
## Cumulative Proportion 0.96041833 0.98636965 1.00000000
# populate df for plotting
df.pca <- data.frame(pc1 = pca.crime$scores[,1], pc2 = pca.crime$scores[,2],clusters = as.factor(kclust$cluster),Index = rownames(crime_s))
ggplot(df.pca,mapping = aes(x = pc1, y = pc2, color = clusters,label = Index)) + geom_point() + geom_text(size = 3) + theme_bw()